
### Project: IADB Government Payroll Analytics - Country
### Project leader: Dr Christian Schuster
### Code author (s): Robert Lipiński
### Date last update: (run below or see 'exec_time.csv')
# Returns the last-modified time of the document currently active in RStudio
# (relies on rstudioapi, so it only works when run from within RStudio)
file.info(rstudioapi::getActiveDocumentContext()$path)$mtime

### Script purpose: Selects and combines all variables needed for the final indicators; 
### re-calculates any variables given changes in the previous scripts (e.g. adjust salary bands).
### Optionally, the user can also re-run any previous R scripts first to update the data


### Execution time: ~15 minutes (see 'exec_time.csv')

### Inputs: 
# 1) /data/intermediate/country_07_limpiar_cubertura.[format1]
# 2) /data/intermediate/country_08_limpiar_ascensos.[format1]
# 3) /data/intermediate/country_09_limpiar_rotacion.[format1]


### Outputs:
# 1) /data/clean/country_full_final.[format1]

#
# SET-UP --------------------------------------------------------------------------------------------
#


### Source the '00_country_global.R' script with required packages and functions
# NOTE(review): the pause presumably gives RStudio time to register the active document
# before its path is queried - confirm whether it is still needed
Sys.sleep(0.5)  # Wait for half a second
# The global script is looked up in the same folder as the document active in RStudio
source(file.path(dirname(rstudioapi::getActiveDocumentContext()$path), '00_country_global.R'))




# ' ------------------------------------------------------------------------------------------------------------------------------------------------------
# (*) RE-RUN PREVIOUS SCRIPTS -----------------------------------------------------------------------------------------------------------------------
#

t0 = Sys.time() # start of the timed run

### If needed, all earlier scripts (01-09) can be re-run from here to refresh the data
anew_all_scripts = FALSE


if (anew_all_scripts) {

  # Scripts 01-02 create the raw files to work with; scripts 03-09 do the actual cleaning.
  # They are sourced one by one in numeric order. The folder and the pattern are rebuilt
  # inside every iteration, so each lookup depends only on the loop variable.
  for (script_prefix in sprintf("%02d", 1:9)) {
    source(list.files(dirname(rstudioapi::getActiveDocumentContext()$path),
                      pattern = paste0("^", script_prefix, ".*\\.R$"),
                      full.names = TRUE))
  }
}


# ' ------------------------------------------------------------------------------------------------------------------------------------------------------
# GATHER REQUIRED VARIABLES -----------------------------------------------------------------------------------------------------------------------
#

### To speed up calculations some variables are saved in separate files created by scripts 07-09. Here we define which variables
### are needed for the final indicators; extract them from the correct data files; and combine them

### list all column names available across the three intermediate files
col_names = sort(unique(c(
  names(open_dataset(file.path(main_dir, 'data', 'intermediate', "country_07_limpiar_cubertura.parquet"))),
  names(open_dataset(file.path(main_dir, 'data', 'intermediate', "country_08_limpiar_ascensos.parquet"))),
  names(open_dataset(file.path(main_dir, 'data', 'intermediate', "country_09_limpiar_rotacion.parquet")))
)))

### list needed column names - either the ones used in the remaining scripts OR present in the instructions file

# Helper: returns the elements of 'candidates' that are found in at least one element of 'text_lines'.
# Each candidate is used as a regular expression (so e.g. 'organismo_nombre' also hits 'organismo_nombre_clean').
# vapply() guarantees a logical vector even when 'candidates' is empty.
cols_mentioned = function(candidates, text_lines) {
  candidates[vapply(candidates, function(col) any(grepl(col, text_lines)), logical(1))]
}

# scripts 10-13 are the ones consuming the final dataset
# (the pattern is grouped so that every prefix, including 13, is anchored to the start of the file name)
later_scripts = list.files(path = file.path(main_dir, 'code'), pattern = "^(10|11|12|13)", full.names = FALSE)

col_later = c()
for (i in later_scripts) {
  # read (and lower-case) each script once, rather than once per column name
  script_lines = tolower(readLines(file.path(main_dir, 'code', i)))
  # get columns used in this script and add them to the columns used in all later scripts
  col_later = sort(unique(c(col_later, cols_mentioned(col_names, script_lines))))
}
col_later

col_instructions = cols_mentioned(col_names, tolower(country_instructions))

### combine
col_used = unique(c(col_later, col_instructions))

### remove if anything spare included (the result has to be assigned back for the removal to take effect)
col_used = col_used[!col_used %in% c('organismo_nombre')]

### add if anything extra useful
# 'row_id_org' is the key used to de-duplicate and join the intermediate files below, so it must always be read
col_used = sort(unique(c(col_used, 'anyo_mes', 'row_id_org')))


### Locations of the three intermediate files that get combined
path_cubertura = file.path(main_dir, 'data', 'intermediate', "country_07_limpiar_cubertura.parquet")
path_ascensos  = file.path(main_dir, 'data', 'intermediate', "country_08_limpiar_ascensos.parquet")
path_rotacion  = file.path(main_dir, 'data', 'intermediate', "country_09_limpiar_rotacion.parquet")


### baseline: the required columns available in the 07 (cubertura) file
in_cubertura = col_used %in% names(open_dataset(path_cubertura))
col_cubertura = col_used[in_cubertura]
country = read_flex(path_cubertura, col_select = col_cubertura)
gc()

# when row_id_org does not identify rows uniquely, drop fully duplicated rows
if (nrow(country) != uniqueN(country$row_id_org)) {
  country = unique(country)
}


### + required columns that only the 08 (ascensos) file can provide
in_ascensos = col_used %in% names(open_dataset(path_ascensos))
already_loaded = col_used %in% names(country)
col_ascensos = col_used[in_ascensos & !already_loaded]

country_ascensos = read_flex(path_ascensos, col_select = c('row_id_org', col_ascensos))
gc()

# same de-duplication rule as for the baseline
if (nrow(country_ascensos) != uniqueN(country_ascensos$row_id_org)) {
  country_ascensos = unique(country_ascensos)
}

# <> combine
# setindex(country, row_id_org)
# setindex(country_ascensos, row_id_org)

gc()

# X[Y] join: every row of 'country' is kept, the new columns are matched on row_id_org
country = country_ascensos[country, on = 'row_id_org']
rm(country_ascensos)
gc()
beep()



### + required columns that only the 09 (rotacion) file can provide
in_rotacion = col_used %in% names(open_dataset(path_rotacion))
already_loaded = col_used %in% names(country)
col_rotacion = col_used[in_rotacion & !already_loaded]

country_rotacion = read_flex(path_rotacion, col_select = c('row_id_org', col_rotacion))

# same de-duplication rule as for the baseline
if (nrow(country_rotacion) != uniqueN(country_rotacion$row_id_org)) {
  country_rotacion = unique(country_rotacion)
}


gc()
country = country_rotacion[country, on = 'row_id_org']
gc()
rm(country_rotacion)


### xxx temp 
# country = country %>% rename(pago_bruto = any_of('pago_bruta'))

# (*)checks -> are all the selected columns now present in 'country'?
sf(col_used %in% names(country))
col_used[!col_used %in% names(country)] # selected columns that are still missing (if any)

# sanity check: mean gross pay of men divided by mean gross pay of women, per year
country[, {
  pago_medio_hombre = fmean(pago_bruto[genero == 'hombre'])
  pago_medio_mujer  = fmean(pago_bruto[genero == 'mujer'])
  pago_medio_hombre / pago_medio_mujer
}, by = .(anyo)]
beep()




### '  ----------------------------------------------------------------------------------------------------------------------

#
# RE-CALCULATE VARIABLES OVER THE NEWLY FILTERED DATA --------------------------------------------------------------------------------------
#

# NOTE: Some variables, like salary bands, are calculated over the full sample distribution. Since the full sample
# has changed, we need to (re-)calculate them now



### + bandos salarial --------------------------------------------------------------------------------------------------------------------------------
## note: needs to be done after having final set of observations as this might affect the calculations
## of the quantiles

# drop any band carried over from the intermediate files
# (guarded, because ':= NULL' on a column that does not exist raises a warning)
if ('banda_salarial' %in% names(country)) {
  country[, banda_salarial := NULL]
}

# break points: 0 followed by the 20/40/60/80/100th percentiles of gross pay
# (finer top-end cuts at .95 and .99 were considered but are not used)
pago_bands1 <- c(0, quantile(country$pago_bruto,
                             probs = c(0.2, 0.4, 0.6, 0.8, 1),
                             na.rm = TRUE))
options(scipen = 999) # disable scientific notation (note: the previous setting is not restored afterwards)

### define band labels in the format "band_[min]_[max]"
# pago_bands_labels1 <- paste0("band_", head(pago_bands1, -1), "_", pago_bands1[-1])

### define pay bands descriptively (in Spanish)
pago_bands_labels1 = c('primero quintil salarial (el más bajo)',
                       'segundo quintil salarial',
                       'tercer quintil salarial',
                       'cuarto quintil salarial',
                       'quinto quintil salarial (el más alto)')

### create bands
# alternative not used: ntile(pago, 5) cuts into perfectly sized bins, but at the cost of putting
# some people getting the same pay into different bins
country = country %>%
  select(-c(matches('banda_salarial'))) %>%
  mutate(banda_salarial = cut(
    pago_bruto,
    breaks = pago_bands1, # use quantile values as breaks
    labels = pago_bands_labels1, # label each quantile group
    right = FALSE, # left-closed intervals give more equally distributed quintiles, even though, due to clustering at certain pay scales, they are still unequal in size
    include.lowest = TRUE # with right = FALSE this closes the LAST interval, so the maximum pay (= last break) lands in the top quintile instead of becoming NA
  ))

### checks -> do 'pago_bruto' values match the desired quantile ranges?
# tapply(country_rotacion$pago_bruto, country_rotacion$banda_salarial, summary)


### pay gap (mean gross pay of men over mean gross pay of women, per year)
country[, fmean(pago_bruto[genero == 'hombre'])/fmean(pago_bruto[genero == 'mujer']), by = .(anyo)]


# ' ------------------------------------------------------------------------------------------------------------------------------------------------------
# FINAL CLEAN  --------------------------------------------------------------------------------------------------------------------------------------------
#

# 
# use_sample <- as.logical(select.list(choices = c("TRUE", "FALSE"),
#                                      title = "Use 10% sample (T/F)?",
#                                      graphics = TRUE))
# 
# if(use_sample){
#   country = read_flex(file.path(main_dir, 'data', 'intermediate', "country_08_limpiar_ascensos (sample10)"), format = format1)
# }else{
#   country = read_flex(file.path(main_dir, 'data', 'intermediate', "country_08_limpiar_ascensos"), format = format1)
# }


### rename ----------------------------------------------------------------------------------------------------------------------------------------------
print('renaming cols')

# lookup of final name = name used in the intermediate files;
# any_of() skips every pair whose old column is not present
rename_map = c(
  person_id        = 'id_full',
  puesto_nombre    = 'tipo_cargo_clean',
  sectorial_nombre = 'organismo_sector_comprimido',
  entidad_nombre   = 'organismo_nombre_clean',
  entidad_codigo   = 'organismo_codigo',
  genero           = 'genero_comprimido',
  contract_type    = 'dataset'
)

country = country %>%
  rename(any_of(rename_map)) %>%
  # add columns still missing (can be moved to previous scripts too)
  mutate(
    # constant 1/12 for every row
    fte = 1/12,
    # collapse the managerial grade into a two-level label
    grado_directivo = ifelse(
      str_detect(grado_directivo, '^1|^direct|^geren'),
      'directivo o gerente',
      'no directivo o gerente'
    ),
    # copy of the rank with the undefined category set to missing
    grado_dummy = ifelse(grado == 'estamento no definido', NA, grado),
    # 'planta' contracts are labelled permanent, every other contract type temporary
    contract_type_dummy = ifelse(contract_type == 'planta', 'permanente', 'temporario'),
    # constant column holding the country label
    country = 'country'
  )


### Requested in Christian's email of 07/08/2025 19:29:
### "can you add (incl. honorarios) to those who don’t have an estamento? (as that is the largest share of those) So call them “Estamento no definido (incl. honorarios)"
## i.e. rank = no estamento + honorarios
## ROBER (28/09/2025): 'codigo de trabajo' is added to the label as well, since this contract type has no rank defined either
grado_sin_definir = 'estamento no definido'
grado_etiqueta = 'estamento no definido (incl. honorarios y codigo de trabajo)'
country[, grado := fifelse(grado == grado_sin_definir, grado_etiqueta, grado)]


## Requested in Christian's email of 05/08/2025 11:17:
### rename 'subsecretaria para las fuerzas armadas' for the dashboard (not for the survey)
nombre_ssffaa = 'subsecretaria para las fuerzas armadas (ssffaa)'
filas_ssffaa = which(country$entidad_nombre == nombre_ssffaa) # which() leaves out rows with a missing name
country$entidad_nombre[filas_ssffaa] = tolower('Subsecretaria de las Fuerzas Armadas (sin uniformados)')

# check: which organisation names mentioning 'las fuerz' remain after the rename
filas_fuerzas = str_detect(country$entidad_nombre, 'las fuerz')
funique(country$entidad_nombre[filas_fuerzas])


### exclude those (kept there for survey report)
# country = country[!(entidad_nombre %in% c('direccion de prevision de carabineros de country (dipreca)',
#                                       'fiscalia nacional economica (fne)',
#                                       'fiscalia de obras publicas'))]


### leave unique org-anyo combinations --------------------------------------------------------------------------------
# NOTE: we only want to keep org-year combinations that occur in the dataset after filtering those
# combinations with <10 unique IDs (see script 06). Otherwise the complete() function in the 
# summary function below will extend the values to combinations we don't want


n_min = 10 # Christian email from 29/07/2025 (11:42) -> "let's filter out anyone with fewer than 10 unique IDs in a year"

# number of distinct person IDs within each year x organisation combination
country[, org_id_n := uniqueN(person_id), by = .(anyo, entidad_nombre)]

# how many rows belong to combinations below the threshold
# (uses n_min rather than a repeated literal, so the check follows the threshold if it changes)
sf(country$org_id_n < n_min)


# the filter itself is currently switched off:
# country = country[org_id_n >= n_min, ]



### + monthly payments -------------------------------------------------------------------------------------------------------------

### Christian's email from 29/07/2025 (06:52): "Multiple payments: super interesting how large this number is. So we want
# to report the number and share here, for the three categories you lay out (split in the usual ways), and that should be it."
# [3 categories laid out are single, double and 3+ payments, see Robert's email from 28/07/2025 (19:37)]

### do all payments contain pago_bruto, or do some contain only one of the extra payments?
## R: Extremely few - almost 99% of obs are non-missing and non-zero for pago_bruto
## while 81.2% are 0 on extra payments (aggregated from all types of additional payment types)
prop.table(table(country$pago_bruto == 0, country$pago_extra_total == 0, useNA = 'ifany'))



### calculate number of observations of each ID per month (integer count at this stage)
country[, multi_n := .N, by = .(person_id, anyo, mes)]

### flag person-years with more than one payment in at least one month
# NOTE: this is derived while multi_n is still numeric; once multi_n holds text labels
# a '> 1' comparison would silently become a string comparison
country[, multi_anyo := fifelse(any(multi_n > 1L),
                              "pago múltiple en al menos un mes",
                              'ningún pago múltiple'), by = .(anyo, person_id)]

### turn the monthly count into the three reporting categories: "1", "2", "3 o más"
country[, multi_n := fifelse(multi_n >= 3L, "3 o más", as.character(multi_n))]


## checks > distribution - overall + by key position
country$multi_n %>% pr_na # only ~4.4% in 3+ category
tapply(country$multi_n, country$puesto_clave, pr_na) # doctors and nurses 'moonlighting' the most (9% and 6% respectively)




### (*)checks > gender gap
## NOTE: at one point the script was feeding its changes back into the raw file and producing nonsensical and unstable
## values. This is a sanity check - gender gaps should be ~1.20-1.24 for all years (in favor of men)

### pay gap: mean gross pay of men divided by mean gross pay of women, per year
country[, fmean(pago_bruto[genero == 'hombre'])/fmean(pago_bruto[genero == 'mujer']), by = .(anyo)]



### gender directivo gap (women should be ~40-44% of directivos)
# share of distinct person IDs with grado == "directivo" that are women, per year
# NOTE(review): this filters on grado == "directivo"; confirm that this exact level exists in 'grado'
# (the recoded 'grado_directivo' column uses the label 'directivo o gerente')
country[,uniqueN(person_id[grado == "directivo" &  genero == 'mujer'])/
             uniqueN(person_id[grado == "directivo"]),
           by = .(anyo)]



### final checks
pr_isna(country$entidad_nombre) # any NAs still there?

# are the security etc. organizations out for good?
# the trailing 'out' / 'in' notes state the expected result: whether the pattern should be absent from / present in the organisation names
org1 = funique(country$entidad_nombre)
sf(grepl('estado mayor conjunto', org1)) # out
sf(grepl('í', org1)) # out ---> no Spanish diacritics expected ('í' presumably checked as the most common one)

sf(grepl('direccion de prevision de carabineros de country', org1)) # in
sf(grepl('fiscalia nacional economica', org1)) # in
sf(grepl('fiscalia de obras publicas', org1)) # in 
sf(grepl('de las fuerzas armadas', org1)) # in (for dashboard)
sf(grepl('genda', org1)) # out

beep('complete')



### >> save (4 versions) ---------------------------------------------------------------------------------------------------------------------------------

print('saving')
gc()
beep()

# folder receiving the final files
# NOTE(review): this path is relative (no main_dir prefix, unlike the input paths above), so it is
# presumably resolved by write_flex/read_flex or against the working directory - confirm
clean_dir = file.path('data', 'clean')

# version 1: full dataset
write_flex(country, file.path(clean_dir, 'country_full_final'), format = format1)
gc()


### save subsets? (switched off by default)
anew_subset = FALSE

if (anew_subset) {

  # reload the full file when 'country' is no longer in memory
  if (!exists('country')) {
    country = read_flex(file.path(clean_dir, 'country_full_final'), format = format1)
  }

  # version 2: only 2024
  write_flex(country[anyo == 2024],
             file.path(clean_dir, 'country_full_final24'),
             format = format1)
  gc()

  # version 3: only orgs with complete coverage
  write_flex(country[cubertura =='completo'],
             file.path(clean_dir, 'country_full_final_completo'),
             format = format1)
  gc()

  # version 4: only 2024 AND orgs with complete coverage
  write_flex(country[anyo == 2024 & cubertura =='completo'],
             file.path(clean_dir, 'country_full_final_completo24'),
             format = format1)
  gc()
}


beep('complete')
exec_time_fun('exec_time')



#
# FIN DEL CÓDIGO  --------------------------------------------------------------------------------------------
# 